1 What is done here?

  • Types of bound genes
  • Distribution of binding sites over gene regions (with and without normalisation for region length)
  • Distribution of crosslink event over gene regions, metagene profile
  • Distribution of binding sites per gene and in relation to TPM
  • Relative maps of crosslink distribution
  • GeneOntology and REACTOME enrichment analysis

2 What is done?

  • this is an additional analysis for the reply to the reviewers
  • comparison of the differential protein expression from the same PURA proteomics experiment with three different methods of protein discovery from the measured peptides
  • the thermofisher proteome discoverer is compared to Maxquant
  • moreover Maxquant results with and without isobaric matching are compared (isobaric matching matched ions of similar run-times if no unique peptide is found)
  • I also look at the discovered proteins for PURA and PURB depending on these methods

3 Input

# proteomics table
proteomics_tables <- list( thermo_fisher = readxl::read_excel("/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/04-BioID-SILAC/03-Proteomics/data-without-outlier-rep-3/MOL13920_noRepl3_SwissHum_perc_precDet_int_ratio_TTEST_Peptide-high_proteins.xlsx", sheet = 2),
                           maxquant_matching = readxl::read_excel("/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/04-BioID-SILAC/03-Proteomics/XX_reprocessed_data/MOL13920_wo-repl3_MQ1670_withMBR_proteinGroups_LFQ1_matching.xlsx"),
                          maxquant_no_matching = readxl::read_excel("/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/04-BioID-SILAC/03-Proteomics/XX_reprocessed_data/MOL13920_wo-repl3_MQ1670_noMBR_proteinGroups_LFQ1.xlsx"))


p_cut_prot <- 0.05

4 Clean up count tabels

4.1 Number of proteins detected with at least one unique peptide

map(proteomics_tables, ~nrow(.x))
## $thermo_fisher
## [1] 5606
## 
## $maxquant_matching
## [1] 5040
## 
## $maxquant_no_matching
## [1] 5023

4.2 How are proteins matched to genes?

proteomics_tables[[1]] <- proteomics_tables[[1]] %>% dplyr::rename(., gene_name = `Gene Symbol`)
proteomics_tables[[2]] <- proteomics_tables[[2]] %>% dplyr::rename(., gene_name = `Gene symbol`)
proteomics_tables[[3]] <- proteomics_tables[[3]] %>% dplyr::rename(., gene_name = `Gene symbol`)

Found proteins, that are not matched to any gene:

map(proteomics_tables, ~sum(is.na(.x$gene_name)))
## $thermo_fisher
## [1] 0
## 
## $maxquant_matching
## [1] 91
## 
## $maxquant_no_matching
## [1] 0

Genes, that are matched to multiple proteins:

map(proteomics_tables, ~sum(duplicated(.x$gene_name)))
## $thermo_fisher
## [1] 3
## 
## $maxquant_matching
## [1] 93
## 
## $maxquant_no_matching
## [1] 3
map(proteomics_tables, ~.x[duplicated(.x$gene_name),]$gene_name)
## $thermo_fisher
## [1] "SMCR7L; MIEF1" "CDKN2A"        "TMPO"         
## 
## $maxquant_matching
##  [1] "CDKN2A" "RAB34"  "TMPO"   NA       NA       NA       NA       NA      
##  [9] NA       NA       NA       NA       NA       NA       NA       NA      
## [17] NA       NA       NA       NA       NA       NA       NA       NA      
## [25] NA       NA       NA       NA       NA       NA       NA       NA      
## [33] NA       NA       NA       NA       NA       NA       NA       NA      
## [41] NA       NA       NA       NA       NA       NA       NA       NA      
## [49] NA       NA       NA       NA       NA       NA       NA       NA      
## [57] NA       NA       NA       NA       NA       NA       NA       NA      
## [65] NA       NA       NA       NA       NA       NA       NA       NA      
## [73] NA       NA       NA       NA       NA       NA       NA       NA      
## [81] NA       NA       NA       NA       NA       NA       NA       NA      
## [89] NA       NA       NA       NA       NA      
## 
## $maxquant_no_matching
## [1] "CDKN2A" "RAB34"  "TMPO"

Removing both duplicated genes and unknown genes

proteomics_tables <- lapply(proteomics_tables, function(x){
  x <- x[!is.na(x$gene_name),]
  x <- x[!duplicated(x$gene_name),]
})

4.3 Overlap of found genes, that the proteins belong to

all = c(proteomics_tables[[1]]$gene_name, proteomics_tables[[2]]$gene_name, proteomics_tables[[3]]$gene_name) %>% unique() %>% na.omit()

overlap_mat <- cbind(
thermo_fisher = all %in% proteomics_tables[[1]]$gene_name,
maxquant_matching = all  %in% proteomics_tables[[2]]$gene_name,
maxquant_no_matching = all  %in% proteomics_tables[[3]]$gene_name)

fit <- eulerr::euler(overlap_mat)

plot(fit,  quantities = TRUE)

head(all[!(all  %in% proteomics_tables[[2]]$gene_name)])
## [1] "STARD7" "ASDURF" "BECN1"  "BRK1"   "CHCHD5" "CHST14"

5 Normalised abundances

# decoy matches and matches to contaminant should be removed --> already happened?
# any(proteomics_tables[[2]]$Reverse == "+", na.rm = T)
# any(proteomics_tables[[2]]$`Potential contaminant` == "+", na.rm = T)
# 
# any(proteomics_tables[[3]]$Reverse == "+", na.rm = T)
# any(proteomics_tables[[3]]$`Potential contaminant` == "+", na.rm = T)


# LFQ table
LFQs <- list(thermo_fisher = proteomics_tables[[1]][,c(grepl(pattern = "(Normalized)", colnames(proteomics_tables[[1]])) |
                                                         grepl(pattern = "gene_name", colnames(proteomics_tables[[1]])))] %>%
               as.data.frame(),
              maxquant_matching = proteomics_tables[[2]][,c(grepl(pattern = "LFQ", colnames(proteomics_tables[[2]]))|
                                                         grepl(pattern = "gene_name", colnames(proteomics_tables[[2]])))] %>%
               as.data.frame(),
             maxquant_no_matching = proteomics_tables[[3]][,c(grepl(pattern = "LFQ", colnames(proteomics_tables[[3]]))|
                                                         grepl(pattern = "gene_name", colnames(proteomics_tables[[3]])))] %>%
               as.data.frame())


LFQs <- lapply(LFQs, function(x){
  colnames(x) <- c("gene_name", "kd1", "kd2", "kd3", "kd4", "wt1", "wt2", "wt3", "wt4")
  return(x)})

LFQs_t <- map(LFQs, ~reshape2::melt(.x))

LFQs_t2 <- full_join(LFQs_t[[1]], LFQs_t[[2]], by = c("gene_name", "variable"), suffix = c(".tf", ".mq_ma"))
LFQs_t2 <- full_join(LFQs_t2, LFQs_t[[3]], by = c("gene_name", "variable"), suffix = c("", ".mq_noma"))

LFQs_t3 <- reshape2::melt(LFQs_t2)

colnames(LFQs_t3) <- c("gene_name", "sample", "processing", "abundance")


ggplot(LFQs_t2, aes(x=log10(value.tf), y= log10(value.mq_ma)))+
  xlim(5,11)+
  ylim(5,11)+
  ggrastr::rasterise(geom_point(size = 0.1), dpi = 400)+
  ggrastr::rasterise(ggpointdensity::geom_pointdensity(size = 0.1), dpi = 400)+
  xlab("thermofisher abundances")+
  ylab("maxquant - matching abundances")+
  geom_point(data=LFQs_t2[(LFQs_t2$gene_name == "PURA" & grepl(LFQs_t2$variable, pattern = "kd")), ], aes(x=log10(value.tf), y= log10(value.mq_ma)), color = "orange", size = 0.5)+
  geom_point(data=LFQs_t2[(LFQs_t2$gene_name == "PURA" & grepl(LFQs_t2$variable, pattern = "wt")), ], aes(x=log10(value.tf), y= log10(value.mq_ma)), color = "red", size = 0.5)+
  theme_paper()+
  theme(legend.key.size = unit(0.15, 'cm'))

#ggsave(paste0(outpath, "Scatter_tf_MQ.pdf"), width = 7, height = 6, units = "cm")

ggplot(LFQs_t2, aes(x=log10(value), y= log10(value.mq_ma)))+
  ggrastr::rasterise(geom_point(size = 0.1), dpi = 400)+
  ggrastr::rasterise(ggpointdensity::geom_pointdensity(size = 0.1), dpi = 400)+
  xlab("maxquant - no matching abundances")+
  ylab("maxquant - matching abundances")+
  geom_point(data=LFQs_t2[(LFQs_t2$gene_name == "PURA" & grepl(LFQs_t2$variable, pattern = "wt")), ], aes(x=log10(value), y= log10(value.mq_ma)), color = "red", size = 0.5)+
  geom_point(data=LFQs_t2[(LFQs_t2$gene_name == "PURA" & grepl(LFQs_t2$variable, pattern = "kd")), ], aes(x=log10(value), y= log10(value.mq_ma)), color = "orange", size = 0.5)+
  theme_paper()+
  theme(legend.key.size = unit(0.15, 'cm'))

#ggsave(paste0(outpath, "Scatter_MQ_match_no_match.pdf"), width = 7, height = 6, units = "cm")

5.1 Excluding to high fallout rates

–> We exclude proteins with 2 or 3 missing values in either of the conditions

# turn 0 into NA
LFQs[[1]][LFQs[[1]]==0] = NA
LFQs[[2]][LFQs[[2]]==0] = NA
LFQs[[3]][LFQs[[3]]==0] = NA


# sum fall outs
LFQs <- lapply(LFQs, function(x){
  x$na_kd <- apply(x,1, function(y) sum(is.na(y[2:5])))
  x$na_wt <- apply(x,1, function(y) sum(is.na(y[6:9])))
  return(x)
})

LFQ_filt <- LFQs %>% map(., ~dplyr::filter(.x, (na_wt %in% c(0,1)) & (na_kd %in% c(0,1))))

LFQ_4_zeros <- LFQs %>% map(., ~dplyr::filter(.x, (((na_wt == 4) & (na_kd %in% c(0,1)))| ((na_kd == 4)& (na_wt %in% c(0,1))))))

#saveRDS(LFQ_filt, paste0(outpath, "LFQ_tables.rds"))

Number of Proteins after zero filtering: (max one missing value per condition)

map(LFQ_filt, ~ nrow(.x))
## $thermo_fisher
## [1] 5373
## 
## $maxquant_matching
## [1] 4453
## 
## $maxquant_no_matching
## [1] 3537

Proteins with fall-out only in one condition: (4 zeros in one condition and max 1 zero in other condition)

map(LFQ_4_zeros, ~ .x$gene_name)
## $thermo_fisher
##  [1] "STARD7"              "EEF2K; LOC101930123" "GMFB"               
##  [4] "LMTK2"               "MIF4GD"              "RAD51AP2"           
##  [7] "SDF2"                "CYB561"              "DHRS11"             
## [10] "FYCO1"               "KLC4"                "KPRP"               
## [13] "OXNAD1"              "SLC26A6"             "TMEM87B"            
## 
## $maxquant_matching
##  [1] "ADAM17"   "ARHGAP18" "ARHGAP29" "ATP11A"   "BNIP1"    "C1orf122"
##  [7] "CAPN15"   "CDT1"     "CGNL1"    "CNOT8"    "CYFIP2"   "DHX33"   
## [13] "DTL"      "DUSP27"   "ENG"      "FAM71B"   "GDAP2"    "GOLGA7"  
## [19] "GPN3"     "H2AC20"   "HAUS2"    "HDAC6"    "KDM5B"    "KIF20A"  
## [25] "MDFIC"    "MGME1"    "MRPL54"   "MTERF3"   "NCOA3"    "NDUFAF6" 
## [31] "NOP10"    "NUDT3"    "OGA"      "PCDH1"    "PIK3C2A"  "PKP1"    
## [37] "PM20D2"   "PPIL3"    "PPTC7"    "PRMT6"    "PRORSD1P" "PTGR2"   
## [43] "SELENOK"  "SERF2"    "SLC2A4"   "TRIM16"   "UBE2E3"   "UBR7"    
## [49] "YPEL5"   
## 
## $maxquant_no_matching
##  [1] "COX7C"   "PURA"    "AAK1"    "ACAD11"  "ADI1"    "APLP2"   "ATF1"   
##  [8] "ATF7"    "B4GALT7" "C1QTNF6" "CAP2"    "CASP2"   "CBFB"    "CCNH"   
## [15] "CD47"    "CDT1"    "CMC1"    "CNOT11"  "COMMD10" "CUL5"    "CYFIP2" 
## [22] "CYP2S1"  "DCAF7"   "DHX33"   "DUSP27"  "EFEMP1"  "ELOF1"   "EPM2A"  
## [29] "EPN2"    "FAM71B"  "FGFR1OP" "GINS1"   "GLMP"    "GMEB2"   "HDAC6"  
## [36] "JPH1"    "KANK2"   "LAMA5"   "LAMC1"   "LMAN2L"  "LMNB2"   "LRP8"   
## [43] "LZTFL1"  "MASTL"   "MINDY4"  "MSH5"    "MSTO1"   "NAGLU"   "NEK5"   
## [50] "NFYA"    "NMI"     "NTHL1"   "NUDT3"   "OGA"     "OSBPL11" "OSGEP"  
## [57] "P3H4"    "PEF1"    "PEX3"    "PI4KA"   "PIK3C2A" "PIK3C2B" "PLXNA2" 
## [64] "PPP6R3"  "PSEN2"   "RAP2B"   "RPIA"    "RWDD1"   "S100A14" "SCAP"   
## [71] "SFT2D1"  "SH3YL1"  "SIRT5"   "SLC44A1" "SLC6A8"  "SLC7A6"  "SMAP1"  
## [78] "SMIM7"   "SPAG5"   "SPRYD7"  "SRGAP2"  "ST7"     "TBC1D5"  "TMEM41A"
## [85] "TMEM63B" "TRIP11"  "UQCC2"   "VPS28"   "VPS51"   "VPS53"   "WDR45B" 
## [92] "YRDC"    "ZBTB11"  "ZNF830"  "ZNHIT2"  "ZPR1"
map(LFQ_4_zeros, ~ nrow(.x))
## $thermo_fisher
## [1] 15
## 
## $maxquant_matching
## [1] 49
## 
## $maxquant_no_matching
## [1] 96

6 Differential analysis

There are multiple options for differential analysis

  • Thermo fisher in build analysis
  • limma
  • DEqMs
  • Perseus

6.1 Is data median centered?

Use boxplot to check if the samples have medians centered. if not, do median centering.

–> yes

###################
# function DEqMS
###################

f_DEqMS <- function(protein_matrix, pep_count_table){

# model
##################
condition = as.factor(c(rep("kd",4), rep("wt",4)))
design = model.matrix(~0+condition)

contrasts <- c("conditionkd-conditionwt")
contrast <- makeContrasts(contrasts=contrasts, levels = design)

# fit model to matrix
########################

fit1 = lmFit(protein_matrix, design = design)
fit2 = contrasts.fit(fit1, contrasts = contrast)
fit3 <- eBayes(fit2)


# correct bias of variance estimate
####################################

fit3$count = pep_count_table[rownames(fit3$coefficients),"count"]


fit4 <- spectraCounteBayes(fit3)

VarianceBoxplot(fit4, main = "Fit of variance estimate",
                xlab="peptide count + 1")

VarianceScatterplot(fit4, main = "Fit of variance estimate",
                xlab="peptide count + 1")


# adding gene info to results
DEqMS.results = outputResult(fit4, coef_col = 1)

DEqMS.results$gene_name <- rownames(DEqMS.results)

return(DEqMS.results)
}

DEqMS is based on limma, but adjustes the p-values in the end for the variance given for the specific number of unique peptides found

6.1.1 Fit variance to peptide count in DEqMS

#####################
# peptide count tables
########################
# Maxquant: we use minimum peptide count among six samples
# count unique+razor peptides used for quantification

pep.count.table = list( thermo_fisher = data.frame(count = proteomics_tables[[1]]$`# Unique Peptides`,
                             row.names = proteomics_tables[[1]]$gene_name),
  maxquant_mat = data.frame(count = rowMins(as.matrix(proteomics_tables[[2]][,20:27])),
                             row.names = proteomics_tables[[2]]$gene_name),
  maxquant_nomat = data.frame(count = rowMins(as.matrix(proteomics_tables[[3]][,20:27])),
                             row.names = proteomics_tables[[3]]$gene_name))
# Minimum peptide count of some proteins can be 0
# add pseudocount 1 to all proteins
pep.count.table = map(pep.count.table, ~mutate(.x, count = count+1))

# run
rownames(protein.matrix[[1]]) = LFQ_filt[[1]]$gene_name
rownames(protein.matrix[[2]]) = LFQ_filt[[2]]$gene_name
rownames(protein.matrix[[3]]) = LFQ_filt[[3]]$gene_name

deqms <- map2(protein.matrix, pep.count.table, ~ f_DEqMS(protein_matrix = .x, pep_count_table = .y))

6.2 Vulcanos

titles <- as.list(names(deqms))

map2(deqms, titles, ~ggplot(.x, aes(x = logFC, y = (-log10(sca.adj.pval))))+
   geom_point(color ="grey", shape =1)+
   geom_point(data = .x[(.x$sca.adj.pval< 0.05) & (abs(.x$logFC) > log(0.5)), ],
              aes(x= logFC, y = (-log10(sca.adj.pval))))+
   geom_point(data = .x[.x$gene_name %in% c("CTNNA1", "SQSTM1", "LSM14A", "RBFOX2", "DDX6", "GAPDH", "TUBB", "PURA", "PURB"),],
              aes(x= logFC, y = -log10(sca.adj.pval)), color = "orange")+
  ggrepel::geom_label_repel( data = .x[.x$gene_name %in% c("CTNNA1", "SQSTM1", "LSM14A", "RBFOX2", "DDX6", "GAPDH", "TUBB","PURA", "PURB"),],
                             aes( x = logFC,  y = (-log10(sca.adj.pval)), label = gene_name), max.overlaps = 10)+
   xlab("Protein fold change (kd/wt) [log2]")+
   ylab("adj. p-value (protein fold change)")+
  theme_paper()+
   theme(legend.position = "none", aspect.ratio = 1/1)+
    ggtitle(paste(.y, "\n DEqMS differential analysis")))
## $thermo_fisher

## 
## $maxquant_matching

## 
## $maxquant_no_matching

6.3 default differential analysis by Thermofisher result

–> striped

ggplot(proteomics_tables[[1]], aes(x = log2(`Abundance Ratio: (PurA-KD) / (WT)`), y = `Exp. q-value: Combined`))+
         geom_point(color ="grey", shape =1)+
   geom_point(data = proteomics_tables[[1]][(proteomics_tables[[1]]$`Exp. q-value: Combined`< 0.05) & (abs(log2(proteomics_tables[[1]]$`Abundance Ratio: (PurA-KD) / (WT)`)) > log(0.5)), ],
              aes(x=log2(`Abundance Ratio: (PurA-KD) / (WT)`), y = (-log10(`Exp. q-value: Combined`))))+
   geom_point(data = proteomics_tables[[1]][proteomics_tables[[1]]$gene_name %in% c("CTNNA1", "SQSTM1", "LSM14A", "RBFOX2", "DDX6", "GAPDH", "TUBB", "PURA", "PURB"),],
              aes(x=log2(`Abundance Ratio: (PurA-KD) / (WT)`), y = -log10(`Exp. q-value: Combined`)), color = "orange")+
  ggrepel::geom_label_repel( data = proteomics_tables[[1]][proteomics_tables[[1]]$gene_name %in% c("CTNNA1", "SQSTM1", "LSM14A", "RBFOX2", "DDX6", "GAPDH", "TUBB","PURA", "PURB"),],
                             aes( x =log2(`Abundance Ratio: (PurA-KD) / (WT)`),  y = (-log10(`Exp. q-value: Combined`)), label = gene_name), max.overlaps = 10)+
   xlab("Protein fold change (kd/wt) [log2]")+
   ylab("adj. p-value (protein fold change)")+
  theme_paper()+
   theme(legend.position = "none", aspect.ratio = 1/1)+
    ggtitle(paste(" Detection & quantification from pipeline"))

7 Why are the results for PURA and PURB differnet for the different preprocessings?

7.1 Countplots PURA, PURB

###############
# countplot PURA
#################
LFQ_filt_PURA <- map(LFQ_filt, ~.x[.x$gene_name=="PURA",])

LFQ_filt_PURA <- map(LFQ_filt_PURA, ~reshape2::melt(.x))

LFQ_filt_PURA <- map(LFQ_filt_PURA, ~.x[1:8,]) %>% map(., ~mutate(.x, condition = substr(variable,1,2)))

LFQ_filt_PURA_no_ma <- LFQ_4_zeros[[3]][LFQ_4_zeros[[3]]$gene_name=="PURA",] %>% reshape2::melt()
LFQ_filt_PURA_no_ma <- LFQ_filt_PURA_no_ma[1:8,] %>% mutate(., condition = substr(variable,1,2))

LFQ_filt_PURA[[1]]$processing <- "thermo fisher"
LFQ_filt_PURA[[2]]$processing <- "maxquant + matching"
LFQ_filt_PURA_no_ma$processing <- "maxquant, no matching"

gg_df <- rbind(LFQ_filt_PURA[[1]], LFQ_filt_PURA[[2]], LFQ_filt_PURA_no_ma)

ggplot(gg_df, aes(x = variable, y = log(value), color = processing))+
  geom_point()+
  ggtitle("Intensities of PURA from thermo fisher or maxquant processing")+
  theme_paper()+
  theme(panel.grid.major =  element_line(color = "lightgrey", size = 0.25),
        legend.position = "None")

ggplot(gg_df, aes(x = variable, y = log(value), color = processing))+
  geom_point()+
  theme_paper()+
  theme(panel.grid.major =  element_line(color = "lightgrey", size = 0.25),
        legend.position = "None")

#ggsave(filename = paste0(outpath, "protein_counts.pdf"), width = 6, height = 6, units = "cm")

###############
# countplot PURB
#################

LFQ_filt_PURB <- map(LFQ_filt, ~.x[.x$gene_name=="PURB",])

LFQ_filt_PURB <- map(LFQ_filt_PURB, ~reshape2::melt(.x))

LFQ_filt_PURB <- map(LFQ_filt_PURB, ~.x[1:8,]) %>% map(., ~mutate(.x, condition = substr(variable,1,2)))

# LFQ_filt_PURB_no_ma <- LFQ_4_zeros[[3]][LFQ_4_zeros[[3]]$gene_name=="PURB",] %>% reshape2::melt()
# LFQ_filt_PURB_no_ma <- LFQ_filt_PURB_no_ma[1:8,] %>% mutate(., condition = substr(variable,1,2))

LFQ_filt_PURB[[1]]$processing <- "thermo fisher"
LFQ_filt_PURB[[2]]$processing <- "maxquant + matching"
LFQ_filt_PURB[[3]]$processing <- "maxquant, no matching"

gg_df <- rbind(LFQ_filt_PURB[[1]], LFQ_filt_PURB[[2]], LFQ_filt_PURB[[3]])

ggplot(gg_df, aes(x = variable, y = log(value), color = processing))+
  geom_point()+
  ggtitle("Intensities of PURB from thermo fisher or maxquant processing")

#LFQs[[3]][LFQs[[3]]$gene_name=="PURB",]

–> different quantification of proteins

8 Look at PURA and PURB found peptides

peptides_tf <- readxl::read_excel("/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/04-BioID-SILAC/03-Proteomics/peptide_lists/MOL13920/MOL13920_noRepl3_SwissHum_perc_precDet_int_ratio_TTEST_Peptide-high_peptides.xlsx")

peptides_mq <- readxl::read_excel("/Users/melinaklostermann/Documents/projects/PURA/02_R_new_pip/04-BioID-SILAC/03-Proteomics/peptide_lists/MOL13920/MOL13920_wo-repl3_MQ1670_withMBR_peptides_LFQ1.xlsx")

PURA_protein_id <- "Q00577"
PURB_protein_id <- "Q96QR8"

# peptides maxquant
peptides_mq <- peptides_mq %>% 
  rowwise() %>%
  mutate(., protein_id = unlist(strsplit(`Leading razor protein`, split = "|", fixed = T ))[2],
         protein_name = unlist(strsplit(`Leading razor protein`, split = "|", fixed = T))[3],
         protein_name = unlist(strsplit(protein_name, split = "_", fixed = T))[1]) %>% as.data.frame(.)

peptides_PURA_mq <- peptides_mq %>% filter(protein_id == PURA_protein_id )
peptides_PURB_mq <- peptides_mq %>% filter(protein_id == PURB_protein_id )

# peptides thermo fisher
peptides_PURA_tf <- peptides_tf %>% filter(`Master Protein Accessions` == PURA_protein_id )
peptides_PURB_tf <- peptides_tf %>% filter(`Master Protein Accessions` == PURB_protein_id )

peptides_PURA_mq[,c("Sequence", "PEP")]
##                  Sequence        PEP
## 1               FFFDVGSNK 1.6019e-02
## 2 GPGLGSTQGQTIALPAQGLIEFR 1.8044e-05
peptides_PURA_tf[,c("Annotated Sequence", "Abundance Ratio: (PurA-KD) / (WT)", "Qvality PEP", "Qvality q-value" , "Percolator q-Value (by Search Engine): Sequest HT")]
## # A tibble: 3 × 5
##   `Annotated Sequence`           `Abundance Rat…` `Qvality PEP` `Qvality q-val…`
##   <chr>                                     <dbl>         <dbl>            <dbl>
## 1 [R].GPGLGSTQGQTIALPAQGLIEFR.[…            0.554 0.00000000464        0.0000267
## 2 [R].FFFDVGSNK.[Y]                         0.552 0.000998             0.000119 
## 3 [R].NSITVPYK.[V]                          1.12  0.0523               0.00446  
## # … with 1 more variable:
## #   `Percolator q-Value (by Search Engine): Sequest HT` <dbl>
peptides_PURB_mq[,c("Sequence", "PEP")]
##        Sequence        PEP
## 1 GGGEQETQELASK 0.00011241
## 2      NAITVPFK 0.03866400
peptides_PURB_tf[,c("Annotated Sequence", "Abundance Ratio: (PurA-KD) / (WT)", "Qvality PEP", "Qvality q-value" , "Percolator q-Value (by Search Engine): Sequest HT")]
## # A tibble: 2 × 5
##   `Annotated Sequence`  `Abundance Ratio: (PurA…` `Qvality PEP` `Qvality q-val…`
##   <chr>                                     <dbl>         <dbl>            <dbl>
## 1 [R].GGGEQETQELASK.[R]                     0.699      0.000171        0.0000504
## 2 [R].NAITVPFK.[A]                          0.694      0.0389          0.00339  
## # … with 1 more variable:
## #   `Percolator q-Value (by Search Engine): Sequest HT` <dbl>

8.1 Countplots of peptides

# PURA
gg_pura_mq <- peptides_PURA_mq[,(grepl(pattern="LFQ", colnames(peptides_PURA_mq))|
                   grepl(pattern="Sequence", colnames(peptides_PURA_mq)))] %>% reshape2::melt(.)
gg_pura_tf <- peptides_PURA_tf[,(grepl(pattern="Normalized", colnames(peptides_PURA_tf))|
                   grepl(pattern="Annotated Sequence", colnames(peptides_PURA_tf)))] %>% reshape2::melt(.)
gg_pura_tf <- gg_pura_tf %>% rowwise(.) %>% mutate(`Annotated Sequence` = unlist(strsplit(`Annotated Sequence`, split = ".", fixed =T))[2]) %>%
  dplyr::rename(Sequence = `Annotated Sequence`)

gg_pura_mq$condition = c(rep("kd1", 2), rep("kd2", 2), rep("kd3", 2), rep("kd4", 2),
                         rep("wt1", 2), rep("wt2", 2), rep("wt3", 2), rep("wt4", 2))

gg_pura_tf$condition = c(rep("kd1", 3), rep("kd2", 3), rep("kd3", 3), rep("kd4", 3),
                         rep("wt1", 3), rep("wt2", 3), rep("wt3", 3), rep("wt4", 3))

gg_pura_mq$processing = "maxquant"
gg_pura_tf$processing = "thermo fisher"


gg_pura <- rbind(gg_pura_mq, gg_pura_tf)

ggplot(gg_pura, aes(x = condition, y = log(value), color = processing ))+
  geom_point()+
  facet_wrap(~Sequence)+
  theme_paper()

# PURB
gg_purb_mq <- peptides_PURB_mq[,(grepl(pattern="LFQ", colnames(peptides_PURB_mq))|
                   grepl(pattern="Sequence", colnames(peptides_PURB_mq)))] %>% reshape2::melt(.)
gg_purb_tf <- peptides_PURB_tf[,(grepl(pattern="Normalized", colnames(peptides_PURB_tf))|
                   grepl(pattern="Annotated Sequence", colnames(peptides_PURB_tf)))] %>% reshape2::melt(.)
gg_purb_tf <- gg_purb_tf %>% rowwise(.) %>% mutate(`Annotated Sequence` = unlist(strsplit(`Annotated Sequence`, split = ".", fixed =T))[2]) %>%
  dplyr::rename(Sequence = `Annotated Sequence`)

gg_purb_mq$condition = c(rep("kd1", 2), rep("kd2", 2), rep("kd3", 2), rep("kd4", 2),
                         rep("wt1", 2), rep("wt2", 2), rep("wt3", 2), rep("wt4", 2))

gg_purb_tf$condition = c(rep("kd1", 2), rep("kd2", 2), rep("kd3", 2), rep("kd4", 2),
                         rep("wt1", 2), rep("wt2", 2), rep("wt3", 2), rep("wt4", 2))

gg_purb_mq$processing = "maxquant"
gg_purb_tf$processing = "thermo fisher"


gg_purb <- rbind(gg_purb_mq, gg_purb_tf)

ggplot(gg_purb, aes(x = condition, y = log(value), color = processing ))+
  geom_point()+
  facet_wrap(~Sequence)  

–> third peptide only detected in thermofisher preprocessing

8.2 countplots unique peptide vs matching

–> thrid peptide is only found once in thermofisher and then matched into all other samples, this did not happen for maxquant